1.6. Fetal_Health_Prediction#
1.7. Step 1 :#
Imported the libraries and load the initial dataset - fetal_health.csv.
import numpy as np
import pandas as pd
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 2
1 import numpy as np
----> 2 import pandas as pd
ModuleNotFoundError: No module named 'pandas'
import pandas as pd
file_path = 'fetal_health.csv'
data = pd.read_csv(file_path)
print(data.head())
baseline value accelerations fetal_movement uterine_contractions \
0 120.0 0.000 0.0 0.000
1 132.0 0.006 0.0 0.006
2 133.0 0.003 0.0 0.008
3 134.0 0.003 0.0 0.008
4 132.0 0.007 0.0 0.008
light_decelerations severe_decelerations prolongued_decelerations \
0 0.000 0.0 0.0
1 0.003 0.0 0.0
2 0.003 0.0 0.0
3 0.003 0.0 0.0
4 0.000 0.0 0.0
abnormal_short_term_variability mean_value_of_short_term_variability \
0 73.0 0.5
1 17.0 2.1
2 16.0 2.1
3 16.0 2.4
4 16.0 2.4
percentage_of_time_with_abnormal_long_term_variability ... histogram_min \
0 43.0 ... 62.0
1 0.0 ... 68.0
2 0.0 ... 68.0
3 0.0 ... 53.0
4 0.0 ... 53.0
histogram_max histogram_number_of_peaks histogram_number_of_zeroes \
0 126.0 2.0 0.0
1 198.0 6.0 1.0
2 198.0 5.0 1.0
3 170.0 11.0 0.0
4 170.0 9.0 0.0
histogram_mode histogram_mean histogram_median histogram_variance \
0 120.0 137.0 121.0 73.0
1 141.0 136.0 140.0 12.0
2 141.0 135.0 138.0 13.0
3 137.0 134.0 137.0 13.0
4 137.0 136.0 138.0 11.0
histogram_tendency fetal_health
0 1.0 2.0
1 0.0 1.0
2 0.0 1.0
3 1.0 1.0
4 1.0 1.0
[5 rows x 22 columns]
1.8. Step 2 :#
Look into the data informations to check for any null values.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 baseline value 2126 non-null float64
1 accelerations 2126 non-null float64
2 fetal_movement 2126 non-null float64
3 uterine_contractions 2126 non-null float64
4 light_decelerations 2126 non-null float64
5 severe_decelerations 2126 non-null float64
6 prolongued_decelerations 2126 non-null float64
7 abnormal_short_term_variability 2126 non-null float64
8 mean_value_of_short_term_variability 2126 non-null float64
9 percentage_of_time_with_abnormal_long_term_variability 2126 non-null float64
10 mean_value_of_long_term_variability 2126 non-null float64
11 histogram_width 2126 non-null float64
12 histogram_min 2126 non-null float64
13 histogram_max 2126 non-null float64
14 histogram_number_of_peaks 2126 non-null float64
15 histogram_number_of_zeroes 2126 non-null float64
16 histogram_mode 2126 non-null float64
17 histogram_mean 2126 non-null float64
18 histogram_median 2126 non-null float64
19 histogram_variance 2126 non-null float64
20 histogram_tendency 2126 non-null float64
21 fetal_health 2126 non-null float64
dtypes: float64(22)
memory usage: 365.5 KB
data.isnull().sum()
baseline value 0
accelerations 0
fetal_movement 0
uterine_contractions 0
light_decelerations 0
severe_decelerations 0
prolongued_decelerations 0
abnormal_short_term_variability 0
mean_value_of_short_term_variability 0
percentage_of_time_with_abnormal_long_term_variability 0
mean_value_of_long_term_variability 0
histogram_width 0
histogram_min 0
histogram_max 0
histogram_number_of_peaks 0
histogram_number_of_zeroes 0
histogram_mode 0
histogram_mean 0
histogram_median 0
histogram_variance 0
histogram_tendency 0
fetal_health 0
dtype: int64
1.9. Step 3:#
Check for any duplicates and if they are present we keep the first unique value and remove all other duplicates.
data.duplicated().sum()
13
data.drop_duplicates(keep='first', inplace=True)
data.duplicated().sum()
0
1.10. Step 4:#
Perform 3NF and merge back the data into a single dataset as it was intially ,i.e, we have performed data preparation, normalization, and merged our database containing fetal health data into a single csv - fetal_health_from_db.csv.
import sqlite3
import pandas as pd
# Load your dataset
file_path = 'fetal_health.csv' # Update with your file path
df = pd.read_csv(file_path)
# Data cleanup (as done before)
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_')
df.rename(columns={'prolongued_decelerations': 'prolonged_decelerations'}, inplace=True)
# Create a connection to the SQLite database (this creates a file called 'fetal_health.db')
conn = sqlite3.connect('fetal_health.db')
cursor = conn.cursor()
# Step 1: Create tables in the database
# Measurements table
cursor.execute('''
CREATE TABLE IF NOT EXISTS measurements (
measurement_id INTEGER PRIMARY KEY AUTOINCREMENT,
baseline_value REAL,
accelerations REAL,
fetal_movement REAL,
uterine_contractions REAL,
light_decelerations REAL,
severe_decelerations REAL,
prolonged_decelerations REAL
)
''')
# Histogram table
cursor.execute('''
CREATE TABLE IF NOT EXISTS histogram (
histogram_id INTEGER PRIMARY KEY AUTOINCREMENT,
histogram_width REAL,
histogram_min REAL,
histogram_max REAL,
histogram_number_of_peaks REAL,
histogram_number_of_zeroes REAL,
histogram_mode REAL,
histogram_mean REAL,
histogram_median REAL,
histogram_variance REAL,
histogram_tendency REAL
)
''')
# Variability table
cursor.execute('''
CREATE TABLE IF NOT EXISTS variability (
variability_id INTEGER PRIMARY KEY AUTOINCREMENT,
abnormal_short_term_variability REAL,
mean_value_of_short_term_variability REAL,
percentage_of_time_with_abnormal_long_term_variability REAL,
mean_value_of_long_term_variability REAL
)
''')
# Fetal Health table
cursor.execute('''
CREATE TABLE IF NOT EXISTS fetal_health (
fetal_health_id INTEGER PRIMARY KEY AUTOINCREMENT,
fetal_health INTEGER
)
''')
# Main table (Links all the above tables)
cursor.execute('''
CREATE TABLE IF NOT EXISTS main_table (
record_id INTEGER PRIMARY KEY AUTOINCREMENT,
measurement_id INTEGER,
histogram_id INTEGER,
variability_id INTEGER,
fetal_health_id INTEGER,
FOREIGN KEY (measurement_id) REFERENCES measurements(measurement_id),
FOREIGN KEY (histogram_id) REFERENCES histogram(histogram_id),
FOREIGN KEY (variability_id) REFERENCES variability(variability_id),
FOREIGN KEY (fetal_health_id) REFERENCES fetal_health(fetal_health_id)
)
''')
# Step 2: Insert data into the tables
# Insert data into the 'fetal_health' table (without dropping duplicates)
fetal_health_data = df[['fetal_health']].copy()
fetal_health_data['fetal_health_id'] = range(1, len(fetal_health_data) + 1)
fetal_health_data.to_sql('fetal_health', conn, if_exists='replace', index=False)
# Insert data into the 'measurements' table
measurements_columns = [
'baseline_value', 'accelerations', 'fetal_movement', 'uterine_contractions',
'light_decelerations', 'severe_decelerations', 'prolonged_decelerations'
]
measurements_data = df[measurements_columns].drop_duplicates()
measurements_data['measurement_id'] = range(1, len(measurements_data) + 1)
measurements_data.to_sql('measurements', conn, if_exists='replace', index=False)
# Insert data into the 'histogram' table
histogram_columns = [
'histogram_width', 'histogram_min', 'histogram_max', 'histogram_number_of_peaks',
'histogram_number_of_zeroes', 'histogram_mode', 'histogram_mean', 'histogram_median',
'histogram_variance', 'histogram_tendency'
]
histogram_data = df[histogram_columns].drop_duplicates()
histogram_data['histogram_id'] = range(1, len(histogram_data) + 1)
histogram_data.to_sql('histogram', conn, if_exists='replace', index=False)
# Insert data into the 'variability' table
variability_columns = [
'abnormal_short_term_variability', 'mean_value_of_short_term_variability',
'percentage_of_time_with_abnormal_long_term_variability', 'mean_value_of_long_term_variability'
]
variability_data = df[variability_columns].drop_duplicates()
variability_data['variability_id'] = range(1, len(variability_data) + 1)
variability_data.to_sql('variability', conn, if_exists='replace', index=False)
# Insert data into the 'main_table'
main_table_data = pd.DataFrame({
'measurement_id': measurements_data['measurement_id'],
'histogram_id': histogram_data['histogram_id'],
'variability_id': variability_data['variability_id'],
'fetal_health_id': fetal_health_data['fetal_health_id']
})
main_table_data['record_id'] = range(1, len(main_table_data) + 1)
main_table_data.to_sql('main_table', conn, if_exists='replace', index=False)
print("Data inserted into the database and saved as 'fetal_health.db'.")
Data inserted into the database and saved as 'fetal_health.db'.
# Check all tables in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("Tables in the database:")
for table in tables:
print(table[0]) # Print each table's name
Tables in the database:
sqlite_sequence
fetal_health
measurements
histogram
variability
main_table
# Check the structure of a specific table (e.g., 'main_table')
cursor.execute("PRAGMA table_info(main_table);")
columns = cursor.fetchall()
print("\nColumns in 'main_table':")
for column in columns:
print(column)
Columns in 'main_table':
(0, 'measurement_id', 'REAL', 0, None, 0)
(1, 'histogram_id', 'REAL', 0, None, 0)
(2, 'variability_id', 'REAL', 0, None, 0)
(3, 'fetal_health_id', 'INTEGER', 0, None, 0)
(4, 'record_id', 'INTEGER', 0, None, 0)
# Query some rows from the 'main_table' to inspect the data
cursor.execute("SELECT * FROM main_table LIMIT 10;")
rows = cursor.fetchall()
print("\nSample data from 'main_table':")
for row in rows:
print(row)
Sample data from 'main_table':
(1.0, 1.0, 1.0, 1, 1)
(2.0, 2.0, 2.0, 2, 2)
(3.0, 3.0, 3.0, 3, 3)
(4.0, 4.0, 4.0, 4, 4)
(5.0, 5.0, 5.0, 5, 5)
(6.0, 6.0, 6.0, 6, 6)
(7.0, 7.0, 7.0, 7, 7)
(8.0, 8.0, 8.0, 8, 8)
(9.0, None, 9.0, 9, 9)
(10.0, 9.0, 10.0, 10, 10)
# Enable foreign key constraints
cursor.execute("PRAGMA foreign_keys = ON;")
<sqlite3.Cursor at 0x1432ed3c0>
# Check if foreign keys are enabled
cursor.execute("PRAGMA foreign_keys;")
foreign_keys_enabled = cursor.fetchone()
print("\nForeign keys enabled:", foreign_keys_enabled[0] == 1)
Foreign keys enabled: True
# Connect to the SQLite database
import sqlite3
connection = sqlite3.connect("fetal_health.db")
cursor = connection.cursor()
# Update all columns with NULL values to 0.00 in the measurements table
cursor.execute("UPDATE measurements SET baseline_value = 0.00 WHERE baseline_value IS NULL")
cursor.execute("UPDATE measurements SET accelerations = 0.00 WHERE accelerations IS NULL")
cursor.execute("UPDATE measurements SET fetal_movement = 0.00 WHERE fetal_movement IS NULL")
cursor.execute("UPDATE measurements SET uterine_contractions = 0.00 WHERE uterine_contractions IS NULL")
cursor.execute("UPDATE measurements SET light_decelerations = 0.00 WHERE light_decelerations IS NULL")
cursor.execute("UPDATE measurements SET severe_decelerations = 0.00 WHERE severe_decelerations IS NULL")
cursor.execute("UPDATE measurements SET prolonged_decelerations = 0.00 WHERE prolonged_decelerations IS NULL")
# Commit changes and close the connection
<sqlite3.Cursor at 0x1432eedc0>
This is where we work on merging the data back to a single csv
import sqlite3
import csv
# Connect to the SQLite database
conn = sqlite3.connect('fetal_health.db')
cursor = conn.cursor()
# Join all tables based on their foreign key relationships
query = """
SELECT
main_table.record_id,
measurements.*,
histogram.*,
variability.*,
fetal_health.fetal_health
FROM main_table
JOIN measurements ON main_table.measurement_id = measurements.measurement_id
JOIN histogram ON main_table.histogram_id = histogram.histogram_id
JOIN variability ON main_table.variability_id = variability.variability_id
JOIN fetal_health ON main_table.fetal_health_id = fetal_health.fetal_health_id
"""
# Execute the query
cursor.execute(query)
# Fetch the data
rows = cursor.fetchall()
# Get the column names (first get columns from each table)
measurement_columns = [description[0] for description in cursor.description]
# Write to a single CSV file
with open('merged_data.csv', mode='w', newline='') as file:
writer = csv.writer(file)
# Write the header (column names)
writer.writerow(measurement_columns)
# Write the data
writer.writerows(rows)
print("All data exported to 'merged_data.csv'")
All data exported to 'merged_data.csv'
import sqlite3
import pandas as pd
# Create a connection to the SQLite database (using the existing database file)
conn = sqlite3.connect('fetal_health.db')
# Query data from each table and convert it to a DataFrame
measurements_df = pd.read_sql_query("SELECT * FROM measurements", conn)
histogram_df = pd.read_sql_query("SELECT * FROM histogram", conn)
variability_df = pd.read_sql_query("SELECT * FROM variability", conn)
fetal_health_df = pd.read_sql_query("SELECT * FROM fetal_health", conn)
main_table_df = pd.read_sql_query("SELECT * FROM main_table", conn)
# Check for NaN values in main_table for 'fetal_health_id'
print(main_table_df['fetal_health_id'].isna().sum()) # Check how many NaN values in fetal_health_id
# Check if 'fetal_health_id' values in the main table match with those in the fetal_health table
print(fetal_health_df['fetal_health_id'].unique()) # Check unique IDs in fetal_health
print(main_table_df['fetal_health_id'].unique()) # Check unique IDs in main_table
# Clean up 'fetal_health_id' in main_table by filling NaN with an appropriate value (e.g., 0 or remove rows with NaN)
# Optionally, you can choose to drop rows with NaN in 'fetal_health_id' if that’s suitable
main_table_df = main_table_df.dropna(subset=['fetal_health_id'])
# Ensure 'fetal_health_id' is an integer type to avoid mismatches
main_table_df['fetal_health_id'] = main_table_df['fetal_health_id'].astype(int)
# Merge the data into a single DataFrame based on foreign key relationships
merged_df = main_table_df \
.merge(measurements_df, on='measurement_id', how='left') \
.merge(histogram_df, on='histogram_id', how='left') \
.merge(variability_df, on='variability_id', how='left') \
.merge(fetal_health_df, on='fetal_health_id', how='left')
# Check if the merge worked and there are no NaN values in 'fetal_health_id' and 'fetal_health'
print(merged_df[['fetal_health_id', 'fetal_health']].head(10)) # Print the first few rows
# Save the merged DataFrame to a single CSV file
merged_df.to_csv('merged_fetal_health.csv', index=False)
# Close the connection
conn.close()
print("Data has been exported to a single CSV file: 'merged_fetal_health.csv'.")
0
[ 1 2 3 ... 2124 2125 2126]
[ 1 2 3 ... 2124 2125 2126]
fetal_health_id fetal_health
0 1 2.0
1 2 1.0
2 3 1.0
3 4 1.0
4 5 1.0
5 6 3.0
6 7 3.0
7 8 3.0
8 9 3.0
9 10 3.0
Data has been exported to a single CSV file: 'merged_fetal_health.csv'.
import sqlite3
# Create a connection to the SQLite database
conn = sqlite3.connect('fetal_health.db')
# Query to retrieve fetal_health_id and fetal_health from the fetal_health table
query = "SELECT fetal_health_id, fetal_health FROM fetal_health"
# Execute the query and fetch the results into a DataFrame
fetal_health_df = pd.read_sql_query(query, conn)
# Display the retrieved data
print(fetal_health_df)
# Close the connection
conn.close()
fetal_health_id fetal_health
0 1 2.0
1 2 1.0
2 3 1.0
3 4 1.0
4 5 1.0
... ... ...
2121 2122 2.0
2122 2123 2.0
2123 2124 2.0
2124 2125 2.0
2125 2126 1.0
[2126 rows x 2 columns]
import sqlite3
import pandas as pd
# Connect to the SQLite database
conn = sqlite3.connect('fetal_health.db')
# Create a query to join all the tables and get the required columns
query = """
SELECT
main_table.record_id,
measurements.baseline_value,
measurements.accelerations,
measurements.fetal_movement,
measurements.uterine_contractions,
measurements.light_decelerations,
measurements.severe_decelerations,
measurements.prolonged_decelerations,
histogram.histogram_width,
histogram.histogram_min,
histogram.histogram_max,
histogram.histogram_number_of_peaks,
histogram.histogram_number_of_zeroes,
histogram.histogram_mode,
histogram.histogram_mean,
histogram.histogram_median,
histogram.histogram_variance,
histogram.histogram_tendency,
variability.abnormal_short_term_variability,
variability.mean_value_of_short_term_variability,
variability.percentage_of_time_with_abnormal_long_term_variability,
variability.mean_value_of_long_term_variability,
fetal_health.fetal_health
FROM main_table
JOIN measurements ON main_table.measurement_id = measurements.measurement_id
JOIN histogram ON main_table.histogram_id = histogram.histogram_id
JOIN variability ON main_table.variability_id = variability.variability_id
JOIN fetal_health ON main_table.fetal_health_id = fetal_health.fetal_health_id
"""
# Execute the query and fetch the data into a pandas DataFrame
df_from_db = pd.read_sql(query, conn)
# Optionally clean column names to match the original CSV (if needed)
df_from_db.columns = df_from_db.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-', '_')
# Save the resulting DataFrame as a CSV
df_from_db.to_csv('fetal_health_from_db.csv', index=False)
# Close the database connection
conn.close()
print("Database has been successfully converted to 'fetal_health_from_db.csv'.")
Database has been successfully converted to 'fetal_health_from_db.csv'.
Here is the dataset we achieve from the database.
df= pd.read_csv('fetal_health_from_db.csv')
df[:23]
| record_id | baseline_value | accelerations | fetal_movement | uterine_contractions | light_decelerations | severe_decelerations | prolonged_decelerations | histogram_width | histogram_min | ... | histogram_mode | histogram_mean | histogram_median | histogram_variance | histogram_tendency | abnormal_short_term_variability | mean_value_of_short_term_variability | percentage_of_time_with_abnormal_long_term_variability | mean_value_of_long_term_variability | fetal_health | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 120.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 64.0 | 62.0 | ... | 120.0 | 137.0 | 121.0 | 73.0 | 1.0 | 73.0 | 0.5 | 43.0 | 2.4 | 2.0 |
| 1 | 2 | 132.0 | 0.006 | 0.000 | 0.006 | 0.003 | 0.0 | 0.000 | 130.0 | 68.0 | ... | 141.0 | 136.0 | 140.0 | 12.0 | 0.0 | 17.0 | 2.1 | 0.0 | 10.4 | 1.0 |
| 2 | 3 | 133.0 | 0.003 | 0.000 | 0.008 | 0.003 | 0.0 | 0.000 | 130.0 | 68.0 | ... | 141.0 | 135.0 | 138.0 | 13.0 | 0.0 | 16.0 | 2.1 | 0.0 | 13.4 | 1.0 |
| 3 | 4 | 134.0 | 0.003 | 0.000 | 0.008 | 0.003 | 0.0 | 0.000 | 117.0 | 53.0 | ... | 137.0 | 134.0 | 137.0 | 13.0 | 1.0 | 16.0 | 2.4 | 0.0 | 23.0 | 1.0 |
| 4 | 5 | 132.0 | 0.007 | 0.000 | 0.008 | 0.000 | 0.0 | 0.000 | 117.0 | 53.0 | ... | 137.0 | 136.0 | 138.0 | 11.0 | 1.0 | 16.0 | 2.4 | 0.0 | 19.9 | 1.0 |
| 5 | 6 | 134.0 | 0.001 | 0.000 | 0.010 | 0.009 | 0.0 | 0.002 | 150.0 | 50.0 | ... | 76.0 | 107.0 | 107.0 | 170.0 | 0.0 | 26.0 | 5.9 | 0.0 | 0.0 | 3.0 |
| 6 | 7 | 134.0 | 0.001 | 0.000 | 0.013 | 0.008 | 0.0 | 0.003 | 150.0 | 50.0 | ... | 71.0 | 107.0 | 106.0 | 215.0 | 0.0 | 29.0 | 6.3 | 0.0 | 0.0 | 3.0 |
| 7 | 8 | 122.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 68.0 | 62.0 | ... | 122.0 | 122.0 | 123.0 | 3.0 | 1.0 | 83.0 | 0.5 | 6.0 | 15.6 | 3.0 |
| 8 | 10 | 122.0 | 0.000 | 0.000 | 0.003 | 0.000 | 0.0 | 0.000 | 68.0 | 62.0 | ... | 122.0 | 122.0 | 123.0 | 1.0 | 1.0 | 86.0 | 0.3 | 6.0 | 10.6 | 3.0 |
| 9 | 11 | 151.0 | 0.000 | 0.000 | 0.001 | 0.001 | 0.0 | 0.000 | 130.0 | 56.0 | ... | 150.0 | 148.0 | 151.0 | 9.0 | 1.0 | 64.0 | 1.9 | 9.0 | 27.6 | 2.0 |
| 10 | 12 | 150.0 | 0.000 | 0.000 | 0.001 | 0.001 | 0.0 | 0.000 | 130.0 | 56.0 | ... | 150.0 | 148.0 | 151.0 | 10.0 | 1.0 | 64.0 | 2.0 | 8.0 | 29.5 | 2.0 |
| 11 | 13 | 131.0 | 0.005 | 0.072 | 0.008 | 0.003 | 0.0 | 0.000 | 66.0 | 88.0 | ... | 135.0 | 134.0 | 137.0 | 7.0 | 1.0 | 28.0 | 1.4 | 0.0 | 12.9 | 1.0 |
| 12 | 14 | 131.0 | 0.009 | 0.222 | 0.006 | 0.002 | 0.0 | 0.000 | 87.0 | 71.0 | ... | 141.0 | 137.0 | 141.0 | 10.0 | 1.0 | 28.0 | 1.5 | 0.0 | 5.4 | 1.0 |
| 13 | 15 | 130.0 | 0.006 | 0.408 | 0.004 | 0.005 | 0.0 | 0.001 | 107.0 | 67.0 | ... | 143.0 | 125.0 | 135.0 | 76.0 | 0.0 | 21.0 | 2.3 | 0.0 | 7.9 | 1.0 |
| 14 | 16 | 130.0 | 0.006 | 0.380 | 0.004 | 0.004 | 0.0 | 0.001 | 107.0 | 67.0 | ... | 134.0 | 127.0 | 133.0 | 43.0 | 0.0 | 19.0 | 2.3 | 0.0 | 8.7 | 1.0 |
| 15 | 17 | 130.0 | 0.006 | 0.441 | 0.005 | 0.005 | 0.0 | 0.000 | 125.0 | 53.0 | ... | 143.0 | 128.0 | 138.0 | 70.0 | 1.0 | 24.0 | 2.1 | 0.0 | 10.9 | 1.0 |
| 16 | 18 | 131.0 | 0.002 | 0.383 | 0.003 | 0.005 | 0.0 | 0.002 | 107.0 | 67.0 | ... | 134.0 | 125.0 | 132.0 | 45.0 | 0.0 | 18.0 | 2.4 | 0.0 | 13.9 | 2.0 |
| 17 | 19 | 130.0 | 0.003 | 0.451 | 0.006 | 0.004 | 0.0 | 0.001 | 99.0 | 59.0 | ... | 133.0 | 124.0 | 129.0 | 36.0 | 1.0 | 23.0 | 1.9 | 0.0 | 8.8 | 1.0 |
| 18 | 20 | 130.0 | 0.005 | 0.469 | 0.005 | 0.004 | 0.0 | 0.001 | 112.0 | 65.0 | ... | 133.0 | 129.0 | 133.0 | 27.0 | 0.0 | 29.0 | 1.7 | 0.0 | 7.8 | 1.0 |
| 19 | 21 | 129.0 | 0.000 | 0.340 | 0.004 | 0.002 | 0.0 | 0.003 | 128.0 | 54.0 | ... | 129.0 | 104.0 | 120.0 | 138.0 | 0.0 | 30.0 | 2.1 | 0.0 | 8.5 | 3.0 |
| 20 | 22 | 128.0 | 0.005 | 0.425 | 0.003 | 0.003 | 0.0 | 0.002 | 141.0 | 57.0 | ... | 129.0 | 125.0 | 132.0 | 34.0 | 0.0 | 26.0 | 1.7 | 0.0 | 6.7 | 1.0 |
| 21 | 23 | 128.0 | 0.000 | 0.334 | 0.003 | 0.003 | 0.0 | 0.003 | 145.0 | 54.0 | ... | 75.0 | 99.0 | 102.0 | 148.0 | -1.0 | 34.0 | 2.5 | 0.0 | 4.0 | 3.0 |
| 22 | 24 | 128.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.000 | 16.0 | 114.0 | ... | 126.0 | 124.0 | 125.0 | 1.0 | 1.0 | 80.0 | 0.5 | 0.0 | 6.8 | 3.0 |
23 rows × 23 columns
df
| record_id | baseline_value | accelerations | fetal_movement | uterine_contractions | light_decelerations | severe_decelerations | prolonged_decelerations | histogram_width | histogram_min | ... | histogram_mode | histogram_mean | histogram_median | histogram_variance | histogram_tendency | abnormal_short_term_variability | mean_value_of_short_term_variability | percentage_of_time_with_abnormal_long_term_variability | mean_value_of_long_term_variability | fetal_health | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 120.0 | 0.000 | 0.000 | 0.000 | 0.000 | 0.0 | 0.0 | 64.0 | 62.0 | ... | 120.0 | 137.0 | 121.0 | 73.0 | 1.0 | 73.0 | 0.5 | 43.0 | 2.4 | 2.0 |
| 1 | 2 | 132.0 | 0.006 | 0.000 | 0.006 | 0.003 | 0.0 | 0.0 | 130.0 | 68.0 | ... | 141.0 | 136.0 | 140.0 | 12.0 | 0.0 | 17.0 | 2.1 | 0.0 | 10.4 | 1.0 |
| 2 | 3 | 133.0 | 0.003 | 0.000 | 0.008 | 0.003 | 0.0 | 0.0 | 130.0 | 68.0 | ... | 141.0 | 135.0 | 138.0 | 13.0 | 0.0 | 16.0 | 2.1 | 0.0 | 13.4 | 1.0 |
| 3 | 4 | 134.0 | 0.003 | 0.000 | 0.008 | 0.003 | 0.0 | 0.0 | 117.0 | 53.0 | ... | 137.0 | 134.0 | 137.0 | 13.0 | 1.0 | 16.0 | 2.4 | 0.0 | 23.0 | 1.0 |
| 4 | 5 | 132.0 | 0.007 | 0.000 | 0.008 | 0.000 | 0.0 | 0.0 | 117.0 | 53.0 | ... | 137.0 | 136.0 | 138.0 | 11.0 | 1.0 | 16.0 | 2.4 | 0.0 | 19.9 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1703 | 2121 | 140.0 | 0.000 | 0.000 | 0.005 | 0.001 | 0.0 | 0.0 | 31.0 | 124.0 | ... | 145.0 | 143.0 | 145.0 | 2.0 | 0.0 | 77.0 | 0.7 | 17.0 | 6.0 | 1.0 |
| 1704 | 2122 | 140.0 | 0.000 | 0.000 | 0.007 | 0.000 | 0.0 | 0.0 | 40.0 | 137.0 | ... | 153.0 | 150.0 | 152.0 | 2.0 | 0.0 | 79.0 | 0.2 | 25.0 | 7.2 | 2.0 |
| 1705 | 2123 | 140.0 | 0.001 | 0.000 | 0.007 | 0.000 | 0.0 | 0.0 | 66.0 | 103.0 | ... | 152.0 | 148.0 | 151.0 | 3.0 | 1.0 | 78.0 | 0.4 | 22.0 | 7.1 | 2.0 |
| 1706 | 2125 | 140.0 | 0.001 | 0.000 | 0.006 | 0.000 | 0.0 | 0.0 | 66.0 | 103.0 | ... | 152.0 | 147.0 | 151.0 | 4.0 | 1.0 | 78.0 | 0.4 | 27.0 | 7.0 | 2.0 |
| 1707 | 2126 | 142.0 | 0.002 | 0.002 | 0.008 | 0.000 | 0.0 | 0.0 | 42.0 | 117.0 | ... | 145.0 | 143.0 | 145.0 | 1.0 | 0.0 | 74.0 | 0.4 | 36.0 | 5.0 | 1.0 |
1708 rows × 23 columns
1.11. Step 5:#
Check the new csv file achived from the database for any inconsistencies and errors. To do so we use ydata_profiling as shown below :
pip install ydata_profiling
Requirement already satisfied: ydata_profiling in /opt/anaconda3/lib/python3.12/site-packages (4.12.1)
Requirement already satisfied: scipy<1.14,>=1.4.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (1.13.1)
Requirement already satisfied: pandas!=1.4.0,<3,>1.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (2.2.2)
Requirement already satisfied: matplotlib<3.10,>=3.5 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (3.9.2)
Requirement already satisfied: pydantic>=2 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (2.8.2)
Requirement already satisfied: PyYAML<6.1,>=5.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (6.0.1)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (3.1.4)
Requirement already satisfied: visions<0.7.7,>=0.7.5 in /opt/anaconda3/lib/python3.12/site-packages (from visions[type_image_path]<0.7.7,>=0.7.5->ydata_profiling) (0.7.6)
Requirement already satisfied: numpy<2.2,>=1.16.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (1.26.4)
Requirement already satisfied: htmlmin==0.1.12 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (0.1.12)
Requirement already satisfied: phik<0.13,>=0.11.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (0.12.4)
Requirement already satisfied: requests<3,>=2.24.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (2.32.3)
Requirement already satisfied: tqdm<5,>=4.48.2 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (4.66.5)
Requirement already satisfied: seaborn<0.14,>=0.10.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (0.13.2)
Requirement already satisfied: multimethod<2,>=1.4 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (1.12)
Requirement already satisfied: statsmodels<1,>=0.13.2 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (0.14.2)
Requirement already satisfied: typeguard<5,>=3 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (4.4.1)
Requirement already satisfied: imagehash==4.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (4.3.1)
Requirement already satisfied: wordcloud>=1.9.3 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (1.9.4)
Collecting dacite>=1.8 (from ydata_profiling)
Using cached dacite-1.8.1-py3-none-any.whl.metadata (15 kB)
Requirement already satisfied: numba<1,>=0.56.0 in /opt/anaconda3/lib/python3.12/site-packages (from ydata_profiling) (0.60.0)
Requirement already satisfied: PyWavelets in /opt/anaconda3/lib/python3.12/site-packages (from imagehash==4.3.1->ydata_profiling) (1.7.0)
Requirement already satisfied: pillow in /opt/anaconda3/lib/python3.12/site-packages (from imagehash==4.3.1->ydata_profiling) (10.4.0)
Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.12/site-packages (from jinja2<3.2,>=2.11.1->ydata_profiling) (2.1.3)
Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata_profiling) (1.2.0)
Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata_profiling) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata_profiling) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata_profiling) (1.4.4)
Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata_profiling) (24.1)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata_profiling) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<3.10,>=3.5->ydata_profiling) (2.9.0.post0)
Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /opt/anaconda3/lib/python3.12/site-packages (from numba<1,>=0.56.0->ydata_profiling) (0.43.0)
Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas!=1.4.0,<3,>1.1->ydata_profiling) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas!=1.4.0,<3,>1.1->ydata_profiling) (2023.3)
Requirement already satisfied: joblib>=0.14.1 in /opt/anaconda3/lib/python3.12/site-packages (from phik<0.13,>=0.11.1->ydata_profiling) (1.4.2)
Requirement already satisfied: annotated-types>=0.4.0 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic>=2->ydata_profiling) (0.6.0)
Requirement already satisfied: pydantic-core==2.20.1 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic>=2->ydata_profiling) (2.20.1)
Requirement already satisfied: typing-extensions>=4.6.1 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic>=2->ydata_profiling) (4.11.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata_profiling) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata_profiling) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata_profiling) (2.2.3)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3,>=2.24.0->ydata_profiling) (2024.8.30)
Requirement already satisfied: patsy>=0.5.6 in /opt/anaconda3/lib/python3.12/site-packages (from statsmodels<1,>=0.13.2->ydata_profiling) (0.5.6)
Requirement already satisfied: attrs>=19.3.0 in /opt/anaconda3/lib/python3.12/site-packages (from visions<0.7.7,>=0.7.5->visions[type_image_path]<0.7.7,>=0.7.5->ydata_profiling) (23.1.0)
Requirement already satisfied: networkx>=2.4 in /opt/anaconda3/lib/python3.12/site-packages (from visions<0.7.7,>=0.7.5->visions[type_image_path]<0.7.7,>=0.7.5->ydata_profiling) (3.3)
Requirement already satisfied: six in /opt/anaconda3/lib/python3.12/site-packages (from patsy>=0.5.6->statsmodels<1,>=0.13.2->ydata_profiling) (1.16.0)
Using cached dacite-1.8.1-py3-none-any.whl (14 kB)
Installing collected packages: dacite
Attempting uninstall: dacite
Found existing installation: dacite 1.6.0
Uninstalling dacite-1.6.0:
Successfully uninstalled dacite-1.6.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dagshub 0.4.0 requires dacite~=1.6.0, but you have dacite 1.8.1 which is incompatible.
Successfully installed dacite-1.8.1
Note: you may need to restart the kernel to use updated packages.
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Report on fetal health csv", explorative=True)
profile